###########################################
# ---- Load and pre-clean the death records ----
# Stack the two death-name extracts (files labelled F2020 and F2021) into one
# table, then rename its columns with clean_names().
# NOTE(review): clean_names() is not defined in this section. The code assigns
# its result to colnames(), so it must return a vector of names -- confirm that
# the helper in scope does so (janitor::clean_names() returns a data frame).
d2020 <- rbind(
  fread("raw_data/death_data/DeathNamesF2020.csv.D230105.T132230.csv"),
  fread("raw_data/death_data/DeathNamesF2021.csv.D230105.T132237.csv")
)
colnames(d2020) <- clean_names(d2020)

d2020 <- d2020 |>
  # Upper-case the street and turn every run of punctuation and/or spaces into
  # a single space, so the keyword filters below only need one spelling each.
  mutate(
    residence_street = toupper(gsub('[[:punct:] ]+', ' ', residence_street))
  ) |>
  # Keep only Washington residents whose street is a usable physical address:
  # drop unknown addresses, PO boxes, homeless/transient markers and blanks.
  filter(
    !grepl("UNKNOWN", residence_street),
    !grepl("P O BOX", residence_street),
    !grepl("PO BOX", residence_street),
    !grepl("POBOX", residence_street),
    !grepl("HOMELESS", residence_street),
    !grepl("TRANSIENT", residence_street),
    !grepl("NO PERMANENT ADDRESS", residence_street),
    trimws(residence_street) != "",
    residence_state == "WASHINGTON"
  ) |>
  # Each assignment sees the result of the previous one: squeeze whitespace and
  # trim the ends first, then drop the unit designators " APT " and
  # " APARTMENT " (only when surrounded by spaces).
  mutate(
    residence_street = trimws(gsub("\\s+", " ", residence_street)),
    residence_street = gsub(" APT ", " ", residence_street),
    residence_street = gsub(" APARTMENT ", " ", residence_street)
  )

# ---- Apply the address search/replace table ----
# Read the rule table, drop rules that would replace a token with itself,
# upper-case both columns to match the upper-cased streets, and de-duplicate.
ad_cl <- fread("raw_data/address_cleaner.csv") |>
  filter(search != replace) |>
  mutate(across(c(search, replace), toupper)) |>
  distinct()

# Apply the rules one at a time, in table order. Each `search` value is used as
# a regular expression and only matches a whole whitespace-delimited token: the
# surrounding whitespace (or string boundary) is captured and written back
# around the replacement.
# seq_len() makes an empty rule table a clean no-op; `1:nrow(ad_cl)` would
# iterate over c(1, 0) in that case.
# NOTE(review): the pattern consumes the whitespace after a match, so when the
# same token occurs twice in a row only the first occurrence is rewritten by a
# given rule -- confirm whether that matters for this table.
for (i in seq_len(nrow(ad_cl))) {
  d2020$residence_street <- gsub(
    paste0("(^|\\s)", ad_cl$search[i], "(\\s|$)"),
    paste0("\\1", ad_cl$replace[i], "\\2"),
    d2020$residence_street
  )
}

# ---- Standardise streets and names, split the street, checkpoint ----
# Abbreviate the compass words. Each rule is a plain substring replacement
# (it also fires inside longer words) and the rules run in the listed order.
d2020$residence_street <- Reduce(
  function(street, rule) gsub(rule[1], rule[2], street),
  list(c("NORTH", "N"), c("SOUTH", "S"), c("EAST", "E"), c("WEST", "W")),
  d2020$residence_street
)

d2020 <- d2020 |>
  mutate(
    # Names: re-encode from WINDOWS-1252 to UTF-8, lower-case, turn empty
    # strings into NA, then strip all punctuation and spaces.
    across(
      c("decedent_last_name", "decedent_first_name"),
      \(nm) gsub(
        "[[:punct:]]| ", "",
        ifelse(nm == "", NA, tolower(iconv(nm, "WINDOWS-1252", "UTF-8")))
      )
    ),
    # City: strip punctuation and upper-case.
    residence_city = toupper(gsub("[[:punct:]]", "", residence_city))
  ) |>
  # Split the street at its first run of whitespace: everything before it goes
  # to `num`, everything after it to `rest`. The original column is kept.
  extract(
    residence_street,
    into = c("num", "rest"),
    regex = "(.*?)\\s+(.*)",
    remove = FALSE
  )

# Checkpoint the cleaned table to disk and read it straight back.
saveRDS(d2020, "temp/cleaned_death.rds")

d2020 <- readRDS("temp/cleaned_death.rds")

#########################################
# ---- Prepare the voter file ----
# Load the cleaned file and give the city the same treatment as the death
# records: punctuation stripped, upper-cased.
k <- readRDS("temp/cleaned_file.rds") |>
  mutate(
    Residence_Addresses_City =
      toupper(gsub("[[:punct:]]", "", Residence_Addresses_City))
  )

# Apply the same street rewrites used on the death records, in this order:
# compass words abbreviated (plain substring replacement), then the
# space-delimited unit designators removed.
k$street <- Reduce(
  function(street, rule) gsub(rule[1], rule[2], street),
  list(
    c("NORTH", "N"), c("SOUTH", "S"), c("EAST", "E"), c("WEST", "W"),
    c(" APARTMENT ", " "), c(" APT ", " ")
  ),
  k$street
)

k <- k |>
  # Drop every voter row whose birth date + street + city key also appears as
  # a date-of-birth + street + city key in the death records.
  filter(
    !(paste0(Voters_BirthDate, street, Residence_Addresses_City) %in%
        with(d2020, paste0(date_of_birth, residence_street, residence_city)))
  ) |>
  # Split the street at its first run of whitespace into `num` and `rest`,
  # keeping the original column.
  extract(
    street,
    into = c("num", "rest"),
    regex = "(.*?)\\s+(.*)",
    remove = FALSE
  )

# Append the city to `rest`, giving the "street remainder + city" key that the
# address matching is done on.
k[["rest"]] <- paste(k[["rest"]], k[["Residence_Addresses_City"]])

# ---- Match death-record addresses to voter-file addresses ----
# Parse the death date, build the same "street remainder + city" key as the
# voter file, and keep only deaths on or before 2020-12-31 or on or after
# 2021-05-03. Rows whose date fails to parse (NA) are dropped by the filter.
d2020 <- d2020 |>
  mutate(death_date = as.Date(date_of_death, "%m/%d/%Y"),
         rest = paste(rest, residence_city)) |>
  filter(death_date <= "2020-12-31" | death_date >= "2021-05-03")

# Exact matches: the key already exists in the voter file, so the distance is
# 0 and the matched key is the key itself.
good <- d2020 |>
  filter(rest %in% k$rest) |>
  mutate(dist = 0,
         new = rest)

# Everything else goes through fuzzy matching.
bad <- filter(d2020, !(rest %in% k$rest))

# One row per distinct unmatched key (`n` = number of death records with it).
bad_ads <- bad |>
  group_by(rest) |>
  tally()

# Match against the distinct voter keys only. k$rest has one entry per voter,
# so it repeats each address many times; amatch() returns the first of equally
# close candidates and unique() keeps first-occurrence order, so the matched
# string is the same as when searching the full column, with far less work.
voter_keys <- unique(k$rest)

# Closest voter key within Levenshtein distance 3 (NA when none is that close).
bad_ads$best <- amatch(bad_ads$rest,
                       voter_keys,
                       maxDist = 3, method = "lv")

bad_ads$new <- voter_keys[bad_ads$best]

# Actual distance to the chosen key (NA when there was no match).
bad_ads$dist <- stringdist(bad_ads$rest, bad_ads$new, method = "lv")

# Attach the match to every unmatched death record. The key is named
# explicitly so the join cannot silently pick up another shared column name.
bad <- left_join(bad, bad_ads, by = "rest") |>
  select(-best, -n)

full <- bind_rows(good, bad)

saveRDS(full, "temp/wa_death_with_dists.rds")

#######################################################
#######################################################
#######################################################

# ---- Household join: exact address matches only (dist == 0) ----
# Collapse the death records to one row per street + city + period.
# `pre` is TRUE for deaths before 2021-05-03; given the date filter, that means
# deaths on or before 2020-12-31, and FALSE means on or after 2021-05-03.
one_per <- readRDS("temp/wa_death_with_dists.rds") |>
  mutate(death_date = as.Date(date_of_death, "%m/%d/%Y"),
         pre = death_date < "2021-05-03") |>
  filter(death_date <= "2020-12-31" | death_date >= "2021-05-03",
         dist == 0) |>
  group_by(residence_street, residence_city, pre) |>
  summarize(across(c(death_date, age), mean),
            # 1 if any death in the group is coded U071, else 0. NA codes are
            # ignored: with a plain max() one NA code would make the flag NA,
            # and the `treated` logic below would read that as "no death
            # matched at this address".
            covid = as.integer(any(underlying_cod_code == "U071",
                                   na.rm = TRUE)),
            n_dead = n(),
            last_name = min(decedent_last_name),
            # Return an ungrouped table; nothing downstream needs the groups.
            .groups = "drop")

# Pre-period deaths and later deaths, one row per address each.
tre <- filter(one_per, pre)

pra <- filter(one_per, !pre)

#####################################

# Attach pre-period deaths to every voter at the same street + city.
k_m <- left_join(k, tre,
                 by = c("street" = "residence_street",
                        "Residence_Addresses_City" = "residence_city"))

# treated: 0 = no pre-period death matched at the address,
#          1 = matched, no death coded U071,
#          2 = matched, at least one death coded U071.
k_m <- mutate(k_m,
              n_dead = ifelse(is.na(n_dead), 0, n_dead),
              treated = ifelse(is.na(covid), 0,
                               ifelse(covid, 2, 1)))
saveRDS(filter(k_m, treated > 0), "temp/vf.rds")

# Voters with no pre-period match: strip the empty death columns (and num/rest)
# so the later-period deaths can be joined on cleanly.
k_m2 <- k_m |>
  filter(treated == 0) |>
  select(-death_date, -age, -covid, -n_dead,
         -last_name, -treated, -num, -rest, -pre)

# Keep only those voters whose address has a later-period death.
k_m2 <- inner_join(k_m2, pra,
                   by = c("street" = "residence_street",
                          "Residence_Addresses_City" = "residence_city"))

saveRDS(k_m2, "temp/vf_later.rds")

# cleanup() is defined outside this section; presumably it clears the
# workspace except for `k`, which the next section still uses -- confirm.
cleanup("k")
#################################################
#################################################
#################################################

# ---- Household join: exact and close fuzzy matches (dist < 3) ----
# Same aggregation as the exact-match pass, but keyed on house number plus the
# matched voter-file key (`new`), so near-miss spellings of the street + city
# are folded into the voter-file spelling. Records with no fuzzy match
# (dist is NA) are excluded.
# `pre` is TRUE for deaths before 2021-05-03; given the date filter, that means
# deaths on or before 2020-12-31, and FALSE means on or after 2021-05-03.
one_per <- readRDS("temp/wa_death_with_dists.rds") |>
  mutate(death_date = as.Date(date_of_death, "%m/%d/%Y"),
         pre = death_date < "2021-05-03") |>
  filter(death_date <= "2020-12-31" | death_date >= "2021-05-03",
         dist < 3, !is.na(dist)) |>
  mutate(rest = new) |>
  group_by(num, rest, pre) |>
  summarize(across(c(death_date, age), mean),
            # 1 if any death in the group is coded U071, else 0. NA codes are
            # ignored: with a plain max() one NA code would make the flag NA,
            # and the `treated` logic below would read that as "no death
            # matched at this address".
            covid = as.integer(any(underlying_cod_code == "U071",
                                   na.rm = TRUE)),
            n_dead = n(),
            last_name = min(decedent_last_name),
            # Return an ungrouped table; nothing downstream needs the groups.
            .groups = "drop")

# Pre-period deaths and later deaths, one row per address each.
tre <- filter(one_per, pre)

pra <- filter(one_per, !pre)

#####################################

# Attach pre-period deaths to every voter with the same house number and
# street-remainder + city key.
k_m <- left_join(k, tre,
                 by = c("num", "rest"))

# treated: 0 = no pre-period death matched at the address,
#          1 = matched, no death coded U071,
#          2 = matched, at least one death coded U071.
k_m <- mutate(k_m,
              n_dead = ifelse(is.na(n_dead), 0, n_dead),
              treated = ifelse(is.na(covid), 0,
                               ifelse(covid, 2, 1)))

saveRDS(filter(k_m, treated > 0), "temp/vf_wa_dist.rds")

# Voters with no pre-period match: strip the empty death columns so the
# later-period deaths can be joined on cleanly (num/rest stay, as join keys).
k_m2 <- k_m |>
  filter(treated == 0) |>
  select(-death_date, -age, -covid, -n_dead,
         -last_name, -treated, -pre)

# Keep only those voters whose address has a later-period death.
k_m2 <- inner_join(k_m2, pra,
                   by = c("num", "rest"))

saveRDS(k_m2, "temp/vf_later_wa_dist.rds")
